org 100h

  mov al,13h
  int 10h
  push 0xa000
  pop es
  fninit

;Palette: diffuse + specular = L * clamp([0.25,H,1] + L^8/2)
  mov dx,3c8h
  xor ax,ax
  out dx,al
  inc dx
P:
 ;or bx,0b0000011100011111  ; bx = LLLLL... HHH.....
  or bx,0b0000000011111111  ; bx = LLLLLLLL ........

  push di    ; b=1
  push bx    ; g=H
  push 0x40  ; r=0.25

  mov al,bh
POW:
  mul al
  mov al,ah
  inc si
  jpo POW    ; 3 times
  xchg ax,cx
  shr cl,1   ; cl=L^8/2 (0..127)

MAD:
  pop ax     ; rgb
  add al,cl  ; al=L^8/2 + rgb
  jnc CLAMP
  salc       ; clamp to 255
CLAMP:
  mul bh     ; ah=L*clamp(L^8/2 + rgb)
  shr ax,10
  out dx,al
  dec si
  jpo MAD    ; 3 times

  inc bx
  jnz P

  ;scasw ; di=0
  xor di,di

;For each frame: compute rotation matrices and comstants
M fld dword[TIME]
  fadd dword[TIME_DELTA]
  fst dword[TIME]
  fsincos          ;| slowC slowS

  fldl2e           ;=1.442695
  fmul dword[TIME]
  fsincos          ;| fastC fastS slowC slowS

  fldz
  fchs
  fld dword[K0_25] ;bx-0x70 0x60 0x50 0x40  0x30  0x20  0x10  |0
  fld dword[K0_01] ;|  0.01 0.25 -0   fastC fastS slowC slowS |XY

;Store each constant four times (for SSE)
  mov bx,0xa000 - 0x70
STORE:
  mov cx,4
STORE4:
  fst dword[bx]
  add bl,4
  loop STORE4

  fstp st0
  jnz STORE        ; loop 7 times

%define EPS  [bx-0x70] ; 0.01
%define K    [bx-0x60] ; 0.25
%define NABS [bx-0x50] ; -0 = 0x80000000: for -abs()
%define COS  [bx-0x40]
%define SIN  [bx-0x30]

;For each pixel:
X mov bx,es
  mov cl,4
X4:
  mov ax,0xcccd
  mul di
  add dx,0x9b80
  mov [bx],ax
  mov [bx+2],dx
  add bl,4
  inc di
  loop X4      ; di+=4 bx=0xa010

  mov bx,es

%define INTX [bx-1]
%define INTY [bx]

%define x xmm0 ; inputs (destroyed)
%define y xmm1
%define z xmm2
%define o xmm3 ; output: orbit trap
%define a xmm4 ; scratch, output: estimated distance
%define b xmm5 ; scratch
%define R xmm6 ; translation radius

%define Z xmm7 ; depth


;Trace a ray for 30 steps
  sub di,4
Y4:
  mov cl,30
  fld dword[MINUS1] ;| Z=-1
T call FMAP         ;| map(X,Y,Z) Z
  faddp             ;| Z+=map(X,Y,Z)
  loop T
;  mov cl,30
;  movaps Z,[MINUS1] ; Z=-1
;T call FMAP         ;| map(X,Y,Z) Z
;  addps Z,a         ; Z+=map(X,Y,Z)
;  loop T

;Estimate the normal
  call FMAP         ;| map(X,Y,Z) Z
  fxch st1          
  fsub dword EPS    ;| Z-EPS map(X,Y,Z)
  call FMAP         ;| map(X,Y,Z-EPS) Z-EPS map(X,Y,Z)
  fstp st1
  fsubrp st1,st0
  fdiv dword EPS    ;| (map(X,Y,Z-EPS) - map(X,Y,Z)) / EPS
;  call FMAP
;  movaps o,a
;  subps Z,EPS
;  call FMAP        ; a = map(X,Y,Z-EPS)
;  subps a,o        ; a = map(X,Y,Z-EPS) - map(X,Y,Z)
;  divps a,EPS      ; a = (map(X,Y,Z-EPS) - map(X,Y,Z)) / EPS

;Fog
;  minps Z,K1       ; Z = min(Z,1)
;  mulps a,Z        ; a *= Z

;Store pixel
  fmul dword[K255]
  fistp word[bp]
  mov ax,[bp]
  test ax,ax
  jns POS
  xor ax,ax
POS:
  stosb
  add bl,4
  cmp bl,16
  jne Y4
;  mulps a,[K255]
;  cvtps2dq a,a
;  packssdw a,a
;  packuswb a,a     ; clamp to a byte 0..255
;  movd [es:di-4],a

;  movd eax,a
;  movaps a,o
;  xchg eax,ebp
;  xor bx,32
;  jpo STORE
;  lea eax,[ebp*8+eax]
;  stosd

  test di,di
  jnz X

  in al,0x60
  dec al
  jnz M

  mov ax,3
  int 10h
  ret

  
  
  
FMAP:
  fild dword INTX
  fmul dword[SCALE]
  fmul st0            ;| x*x Z
  fild dword INTY
  fmul dword[SCALE]
  fmul st0         
  faddp               ;| x*x+y*y Z
  fld st1
  fmul st0
  faddp               ;| x*x+y*y+z*z Z
  fsqrt               ;| sqrt(x*x + y*y + z*z) Z
  fsub dword K        ;| sqrt(x*x + y*y + z*z)-0.25 Z
  ret
;  movups x,INTX
;  cvtdq2ps x,x
;  mulps x,[SCALE]
;  mulps x,x
;  cvtdq2ps y,INTY
;  mulps y,[SCALE]
;  mulps y,y
;  addps x,y
;  movaps z,Z
;  mulps z,z
;  addps x,z
;  rsqrtps a,x  ; a=(length^2)^(-1/2)
;  mulps a,x    ; a=(length^2)^(-1/2 + 1) = length
;  subps a,K
;  ret


MAP:
  movups x,INTX
  cvtdq2ps y,INTY
  cvtdq2ps x,x ; x,y,z = X,Y,Z
  mulps x,[SCALE]
  mulps y,[SCALE]
  movaps z,Z

;;  xorps o,o    ; o=0
;  movaps R,K   ; R=K: translation = [R,R/4,0]
;  mov ch,19    ; do 19 iterations
;
;;Rotate in the xz and yx planes
;L movaps b,SIN ; b=fastS a=fastC | b=slowS a=slowC 
;  movaps a,COS
;  mulps b,z    ; b=Sz
;  mulps z,a    ; z=Cz
;  mulps a,x    ; a=Cx
;  mulps x,SIN  ; x=Sx
;  addps a,b    ; a=x'=Cx+Sz
;  subps z,x    ; z=z'=Cz-Sx
;
;  movaps x,y   ; cycle x,y,z <- y,z,a
;  movaps y,z
;  movaps z,a
;  xor bx,0x20  ; 0xa000 | 0xa020
;  jpo L
;
;;Reflect along x and y
;  orps x,NABS  ; x=-abs(x)
;  orps y,NABS  ; y=-abs(y)
;
;;Translate
;  movaps a,R
;  mulps a,K    ; a=K*R
;  addps x,R    ; x+=R
;  addps y,a    ; y+=K*R
;
;  subps R,a    ; R*=1-K: scale translation vector

;Squared distance to [0,0,0]
  movaps a,x
  movaps b,y
  mulps a,a    ; a=x*x
  mulps b,b    ; b=y*y
  addps b,a    ; b=x*x+y*y
  movaps a,z
  mulps a,a    ; a=z*z
  addps b,a    ; b=length^2=x*x+y*y+z*z

;;Orbit trap
;;  maxps o,b    ; o = max(o,length^2)
;
;;Iterate 19 times
;  dec ch
;  jnz L

;Compute the distance to a little sphere
;  sqrtps a,b   ; a=length
  rsqrtps a,b  ; a=(length^2)^(-1/2)
  mulps a,b    ; a=(length^2)^(-1/2 + 1) = length

;  subps a,R
  subps a,R    ; a=length-2R: offset by 2*radius
  minps a,K    ; a=min(length-2R,K): limit distance
  ret

K0_25 dd 0.25
K0_01 dd 0.01
align 16
MINUS1 dd -1.0,-1.0,-1.0,-1.0
SCALE dd 2.3283064e-10,2.3283064e-10,2.3283064e-10,2.3283064e-10  ; 2^-32
K255 dd 255.0,255.0,255.0,255.0

TIME dd 0
TIME_DELTA dd 0.01
KK dd 0.25
